{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-05-01T02:51:49.344531Z",
     "start_time": "2017-05-01T10:51:49.304391+08:00"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.models import Sequential\n",
    "from keras.layers.core import Flatten, Dense, Dropout, Reshape, Lambda\n",
    "from keras.layers.convolutional import Conv2D, Convolution2D, MaxPooling2D, ZeroPadding2D\n",
    "from keras.optimizers import SGD\n",
    "import tensorflow as tf\n",
    "from keras import backend as K\n",
    "from IPython.display import SVG\n",
    "from keras.utils.vis_utils import model_to_dot\n",
    "from keras.layers.advanced_activations import LeakyReLU\n",
    "from keras.layers import Input, Dense\n",
    "from keras.models import Model\n",
    "import matplotlib.pyplot as plt\n",
    "from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint\n",
    "import copy\n",
    "import cv2, os\n",
    "import numpy as np\n",
    "from random import shuffle\n",
    "get_ipython().magic(u'matplotlib inline')\n",
    "\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
    "\n",
    "BIN, OVERLAP = 2, 0.1\n",
    "W = 1.\n",
    "ALPHA = 1.\n",
    "MAX_JIT = 3\n",
    "NORM_H, NORM_W = 224, 224\n",
    "VEHICLES = ['Car', 'Truck', 'Van', 'Tram']\n",
    "BATCH_SIZE = 8\n",
    "\n",
    "label_dir = '/home/husky/data/kitti_object/data_object_image_2/training/label_2/'\n",
    "image_dir = '/home/husky/data/kitti_object/data_object_image_2/training/image_2/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing stuff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-05-01T02:51:50.559709Z",
     "start_time": "2017-05-01T10:51:49.347095+08:00"
    },
    "code_folding": [
     88
    ],
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def compute_anchors(angle):\n",
    "    anchors = []\n",
    "    \n",
    "    wedge = 2.*np.pi/BIN\n",
    "    l_index = int(angle/wedge)\n",
    "    r_index = l_index + 1\n",
    "    \n",
    "    if (angle - l_index*wedge) < wedge/2 * (1+OVERLAP/2):\n",
    "        anchors.append([l_index, angle - l_index*wedge])\n",
    "        \n",
    "    if (r_index*wedge - angle) < wedge/2 * (1+OVERLAP/2):\n",
    "        anchors.append([r_index%BIN, angle - r_index*wedge])\n",
    "        \n",
    "    return anchors\n",
    "\n",
    "def compute_angle(anchors):\n",
    "    pass\n",
    "\n",
    "def parse_annotation(label_dir, image_dir):\n",
    "    all_objs = []\n",
    "    dims_avg = {key:np.array([0, 0, 0]) for key in VEHICLES}\n",
    "    dims_cnt = {key:0 for key in VEHICLES}\n",
    "        \n",
    "    for label_file in os.listdir(label_dir):\n",
    "        image_file = label_file.replace('txt', 'png')\n",
    "\n",
    "        for line in open(label_dir + label_file).readlines():\n",
    "            line = line.strip().split(' ')\n",
    "            truncated = np.abs(float(line[1]))\n",
    "            occluded  = np.abs(float(line[2]))\n",
    "\n",
    "            if line[0] in VEHICLES and truncated < 0.1 and occluded < 0.1:\n",
    "                new_alpha = float(line[3]) + np.pi/2.\n",
    "                if new_alpha < 0:\n",
    "                    new_alpha = new_alpha + 2.*np.pi\n",
    "                new_alpha = new_alpha - int(new_alpha/(2.*np.pi))*(2.*np.pi)\n",
    "\n",
    "                obj = {'name':line[0],\n",
    "                       'image':image_file,\n",
    "                       'xmin':int(float(line[4])),\n",
    "                       'ymin':int(float(line[5])),\n",
    "                       'xmax':int(float(line[6])),\n",
    "                       'ymax':int(float(line[7])),\n",
    "                       'dims':np.array([float(number) for number in line[8:11]]),\n",
    "                       'new_alpha': new_alpha\n",
    "                      }\n",
    "                \n",
    "                dims_avg[obj['name']]  = dims_cnt[obj['name']]*dims_avg[obj['name']] + obj['dims']\n",
    "                dims_cnt[obj['name']] += 1\n",
    "                dims_avg[obj['name']] /= dims_cnt[obj['name']]\n",
    "\n",
    "                all_objs.append(obj)\n",
    "            \n",
    "    return all_objs, dims_avg\n",
    "\n",
    "all_objs, dims_avg = parse_annotation(label_dir, image_dir)\n",
    "\n",
    "for obj in all_objs:\n",
    "    # Fix dimensions\n",
    "    obj['dims'] = obj['dims'] - dims_avg[obj['name']]\n",
    "    \n",
    "    # Fix orientation and confidence for no flip\n",
    "    orientation = np.zeros((BIN,2))\n",
    "    confidence = np.zeros(BIN)\n",
    "    \n",
    "    anchors = compute_anchors(obj['new_alpha'])\n",
    "    \n",
    "    for anchor in anchors:\n",
    "        orientation[anchor[0]] = np.array([np.cos(anchor[1]), np.sin(anchor[1])])\n",
    "        confidence[anchor[0]] = 1.\n",
    "        \n",
    "    confidence = confidence / np.sum(confidence)\n",
    "        \n",
    "    obj['orient'] = orientation\n",
    "    obj['conf'] = confidence\n",
    "        \n",
    "    # Fix orientation and confidence for flip\n",
    "    orientation = np.zeros((BIN,2))\n",
    "    confidence = np.zeros(BIN)\n",
    "    \n",
    "    anchors = compute_anchors(2.*np.pi - obj['new_alpha'])\n",
    "    \n",
    "    for anchor in anchors:\n",
    "        orientation[anchor[0]] = np.array([np.cos(anchor[1]), np.sin(anchor[1])])\n",
    "        confidence[anchor[0]] = 1\n",
    "        \n",
    "    confidence = confidence / np.sum(confidence)\n",
    "        \n",
    "    obj['orient_flipped'] = orientation\n",
    "    obj['conf_flipped'] = confidence\n",
    "\n",
    "def prepare_input_and_output(train_inst):\n",
    "    ### Prepare image patch\n",
    "    xmin = train_inst['xmin'] #+ np.random.randint(-MAX_JIT, MAX_JIT+1)\n",
    "    ymin = train_inst['ymin'] #+ np.random.randint(-MAX_JIT, MAX_JIT+1)\n",
    "    xmax = train_inst['xmax'] #+ np.random.randint(-MAX_JIT, MAX_JIT+1)\n",
    "    ymax = train_inst['ymax'] #+ np.random.randint(-MAX_JIT, MAX_JIT+1)\n",
    "    \n",
    "    img = cv2.imread(image_dir + train_inst['image'])\n",
    "    img = copy.deepcopy(img[ymin:ymax+1,xmin:xmax+1]).astype(np.float32)\n",
    "    \n",
    "    # re-color the image\n",
    "    #img += np.random.randint(-2, 3, img.shape).astype('float32')\n",
    "    #t  = [np.random.uniform()]\n",
    "    #t += [np.random.uniform()]\n",
    "    #t += [np.random.uniform()]\n",
    "    #t = np.array(t)\n",
    "\n",
    "    #img = img * (1 + t)\n",
    "    #img = img / (255. * 2.)\n",
    "\n",
    "    # flip the image\n",
    "    flip = np.random.binomial(1, .5)\n",
    "    if flip > 0.5: img = cv2.flip(img, 1)\n",
    "        \n",
    "    # resize the image to standard size\n",
    "    img = cv2.resize(img, (NORM_H, NORM_W))\n",
    "    img = img - np.array([[[103.939, 116.779, 123.68]]])\n",
    "    #img = img[:,:,::-1]\n",
    "    \n",
    "    ### Fix orientation and confidence\n",
    "    if flip > 0.5:\n",
    "        return img, train_inst['dims'], train_inst['orient_flipped'], train_inst['conf_flipped']\n",
    "    else:\n",
    "        return img, train_inst['dims'], train_inst['orient'], train_inst['conf']\n",
    "\n",
    "def data_gen(all_objs, batch_size):\n",
    "    num_obj = len(all_objs)\n",
    "    \n",
    "    keys = range(num_obj)\n",
    "    np.random.shuffle(keys)\n",
    "    \n",
    "    l_bound = 0\n",
    "    r_bound = batch_size if batch_size < num_obj else num_obj\n",
    "    \n",
    "    while True:\n",
    "        if l_bound == r_bound:\n",
    "            l_bound  = 0\n",
    "            r_bound = batch_size if batch_size < num_obj else num_obj\n",
    "            np.random.shuffle(keys)\n",
    "        \n",
    "        currt_inst = 0\n",
    "        x_batch = np.zeros((r_bound - l_bound, 224, 224, 3))\n",
    "        d_batch = np.zeros((r_bound - l_bound, 3))\n",
    "        o_batch = np.zeros((r_bound - l_bound, BIN, 2))\n",
    "        c_batch = np.zeros((r_bound - l_bound, BIN))\n",
    "        \n",
    "        for key in keys[l_bound:r_bound]:\n",
    "            # augment input image and fix object's orientation and confidence\n",
    "            image, dimension, orientation, confidence = prepare_input_and_output(all_objs[key])\n",
    "            \n",
    "            #plt.figure(figsize=(5,5))\n",
    "            #plt.imshow(image/255./2.); plt.show()\n",
    "            #print dimension\n",
    "            #print orientation\n",
    "            #print confidence\n",
    "            \n",
    "            x_batch[currt_inst, :] = image\n",
    "            d_batch[currt_inst, :] = dimension\n",
    "            o_batch[currt_inst, :] = orientation\n",
    "            c_batch[currt_inst, :] = confidence\n",
    "            \n",
    "            currt_inst += 1\n",
    "                \n",
    "        yield x_batch, [d_batch, o_batch, c_batch]\n",
    "        \n",
    "        l_bound  = r_bound\n",
    "        r_bound = r_bound + batch_size\n",
    "        if r_bound > num_obj: r_bound = num_obj\n",
    "\n",
    "def l2_normalize(x):\n",
    "    return tf.nn.l2_normalize(x, dim=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Construct the regression network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-05-01T02:51:50.907880Z",
     "start_time": "2017-05-01T10:51:50.562231+08:00"
    },
    "code_folding": [],
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Construct the network\n",
    "inputs = Input(shape=(224,224,3))\n",
    "# Block 1\n",
    "x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv1')(inputs)\n",
    "x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv2')(x)\n",
    "x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)\n",
    "\n",
    "# Block 2\n",
    "x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv1')(x)\n",
    "x = Conv2D(128, (3, 3), activation='relu', padding='same', name='block2_conv2')(x)\n",
    "x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)\n",
    "\n",
    "# Block 3\n",
    "x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv1')(x)\n",
    "x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv2')(x)\n",
    "x = Conv2D(256, (3, 3), activation='relu', padding='same', name='block3_conv3')(x)\n",
    "x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)\n",
    "\n",
    "# Block 4\n",
    "x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv1')(x)\n",
    "x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv2')(x)\n",
    "x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block4_conv3')(x)\n",
    "x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)\n",
    "\n",
    "# Block 5\n",
    "x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv1')(x)\n",
    "x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv2')(x)\n",
    "x = Conv2D(512, (3, 3), activation='relu', padding='same', name='block5_conv3')(x)\n",
    "x = MaxPooling2D((2, 2), strides=(2, 2), name='block5_pool')(x)\n",
    "\n",
    "x = Flatten()(x)\n",
    "\n",
    "dimension   = Dense(512)(x)\n",
    "dimension   = LeakyReLU(alpha=0.1)(dimension)\n",
    "dimension   = Dropout(0.5)(dimension)\n",
    "dimension   = Dense(3)(dimension)\n",
    "dimension   = LeakyReLU(alpha=0.1, name='dimension')(dimension)\n",
    "\n",
    "orientation = Dense(256)(x)\n",
    "orientation = LeakyReLU(alpha=0.1)(orientation)\n",
    "orientation = Dropout(0.5)(orientation)\n",
    "orientation = Dense(BIN*2)(orientation)\n",
    "orientation = LeakyReLU(alpha=0.1)(orientation)\n",
    "orientation = Reshape((BIN,-1))(orientation)\n",
    "orientation = Lambda(l2_normalize, name='orientation')(orientation)\n",
    "\n",
    "confidence  = Dense(256)(x)\n",
    "confidence  = LeakyReLU(alpha=0.1)(confidence)\n",
    "confidence  = Dropout(0.5)(confidence)\n",
    "confidence  = Dense(BIN, activation='softmax', name='confidence')(confidence)\n",
    "\n",
    "model = Model(inputs, outputs=[dimension, orientation, confidence])\n",
    "\n",
    "#model.load_weights('initial_weights.h5')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Kick off the trainning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-05-01T02:51:50.917482Z",
     "start_time": "2017-05-01T10:51:50.909746+08:00"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "def orientation_loss(y_true, y_pred):\n",
    "    # Find number of anchors\n",
    "    anchors = tf.reduce_sum(tf.square(y_true), axis=2)\n",
    "    anchors = tf.greater(anchors, tf.constant(0.5))\n",
    "    anchors = tf.reduce_sum(tf.cast(anchors, tf.float32), 1)\n",
    "    \n",
    "    # Define the loss\n",
    "    loss = -(y_true[:,:,0]*y_pred[:,:,0] + y_true[:,:,1]*y_pred[:,:,1])\n",
    "    loss = tf.reduce_sum(loss, axis=1)\n",
    "    loss = loss / anchors\n",
    "    \n",
    "    return tf.reduce_mean(loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-05-01T02:51:51.068857Z",
     "start_time": "2017-05-01T10:51:50.919082+08:00"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "early_stop  = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, mode='min', verbose=1)\n",
    "checkpoint  = ModelCheckpoint('weights.hdf5', monitor='val_loss', verbose=1, save_best_only=True, mode='min', period=1)\n",
    "tensorboard = TensorBoard(log_dir='../logs/', histogram_freq=0, write_graph=True, write_images=False)\n",
    "\n",
    "all_exams  = len(all_objs)\n",
    "trv_split  = int(0.9*all_exams)\n",
    "batch_size = 8\n",
    "np.random.shuffle(all_objs)\n",
    "\n",
    "train_gen = data_gen(all_objs[:trv_split],          batch_size)\n",
    "valid_gen = data_gen(all_objs[trv_split:all_exams], batch_size)\n",
    "\n",
    "train_num = int(np.ceil(trv_split/batch_size))\n",
    "valid_num = int(np.ceil((all_exams - trv_split)/batch_size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-23T07:59:34.809443Z",
     "start_time": "2017-04-23T15:57:59.557392+08:00"
    },
    "collapsed": true,
    "hidden": true
   },
   "outputs": [],
   "source": [
    "minimizer  = SGD(lr=0.0001)\n",
    "model.compile(optimizer='adam',#minimizer,\n",
    "              loss={'dimension': 'mean_squared_error', 'orientation': orientation_loss, 'confidence': 'mean_squared_error'},\n",
    "                  loss_weights={'dimension': 1., 'orientation': 1., 'confidence': 1.})\n",
    "model.fit_generator(generator = train_gen, \n",
    "                    steps_per_epoch = train_num, \n",
    "                    epochs = 500, \n",
    "                    verbose = 1, \n",
    "                    validation_data = valid_gen, \n",
    "                    validation_steps = valid_num, \n",
    "                    callbacks = [early_stop, checkpoint, tensorboard], \n",
    "                    max_q_size = 3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regress 3D boxes on a video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-05-01T03:30:45.190841Z",
     "start_time": "2017-05-01T11:30:45.182191+08:00"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.load_weights('weights.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-05-01T03:51:07.574745Z",
     "start_time": "2017-05-01T11:48:43.975138+08:00"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "image_dir = '/home/husky/github/vehicle-detector/2011_09_26/2011_09_26_drive_0009_sync/image_02/data/'\n",
    "box2d_loc = '/home/husky/github/vehicle-detector/2011_09_26/2011_09_26_drive_0009_sync/box_2d/'\n",
    "box3d_loc = '/home/husky/github/vehicle-detector/2011_09_26/2011_09_26_drive_0009_sync/box_3d/'\n",
    "\n",
    "all_image = sorted(os.listdir(image_dir))\n",
    "#np.random.shuffle(all_image)\n",
    "\n",
    "for f in all_image:\n",
    "    image_file = image_dir + f\n",
    "    box2d_file = box2d_loc + f.replace('png', 'txt')\n",
    "    box3d_file = box3d_loc + f.replace('png', 'txt')\n",
    "    \n",
    "    with open(box3d_file, 'w') as box3d:\n",
    "        img = cv2.imread(image_file)\n",
    "        img = img.astype(np.float32, copy=False)\n",
    "\n",
    "        for line in open(box2d_file):\n",
    "            line = line.strip().split(' ')\n",
    "            truncated = np.abs(float(line[1]))\n",
    "            occluded  = np.abs(float(line[2]))\n",
    "\n",
    "            obj = {'xmin':int(float(line[4])),\n",
    "                   'ymin':int(float(line[5])),\n",
    "                   'xmax':int(float(line[6])),\n",
    "                   'ymax':int(float(line[7])),\n",
    "                  }\n",
    "\n",
    "            patch = img[obj['ymin']:obj['ymax'],obj['xmin']:obj['xmax']]\n",
    "            patch = cv2.resize(patch, (NORM_H, NORM_W))\n",
    "            patch = patch - np.array([[[103.939, 116.779, 123.68]]])\n",
    "            patch = np.expand_dims(patch, 0)\n",
    "\n",
    "            prediction = model.predict(patch)\n",
    "\n",
    "            # Transform regressed angle\n",
    "            max_anc = np.argmax(prediction[2][0])\n",
    "            anchors = prediction[1][0][max_anc]\n",
    "\n",
    "            if anchors[1] > 0:\n",
    "                angle_offset = np.arccos(anchors[0])\n",
    "            else:\n",
    "                angle_offset = -np.arccos(anchors[0])\n",
    "\n",
    "            wedge = 2.*np.pi/BIN\n",
    "            angle_offset = angle_offset + max_anc*wedge\n",
    "            angle_offset = angle_offset % (2.*np.pi)\n",
    "\n",
    "            angle_offset = angle_offset - np.pi/2\n",
    "            if angle_offset > np.pi:\n",
    "                angle_offset = angle_offset - (2.*np.pi)\n",
    "\n",
    "            line[3] = str(angle_offset)\n",
    "\n",
    "            # Transform regressed dimension\n",
    "            dims = dims_avg['Car'] + prediction[0][0]\n",
    "\n",
    "            line = line + list(dims)\n",
    "\n",
    "            # Write regressed 3D dim and oritent to file\n",
    "            line = ' '.join([str(item) for item in line]) + '\\n'\n",
    "            box3d.write(line)\n",
    "\n",
    "            #cv2.rectangle(img, (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3)\n",
    "    \n",
    "    #plt.figure(figsize=(10,10))\n",
    "    #plt.imshow(img/255.); plt.show()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:tf]",
   "language": "python",
   "name": "conda-env-tf-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "83px",
    "width": "251px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
